package com.weiron;

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch word count over a text file using the Flink DataSet API.
 *
 * <p>Reads the input file line by line, splits every line into
 * whitespace-separated words, counts the occurrences of each word and prints
 * the resulting {@code (word, count)} tuples.
 *
 * @author: weiRon
 * @date: 2023/4/11 16:25
 */
public class BatchWordCountDemo {

    /** Input file used when no path is passed on the command line. */
    private static final String DEFAULT_INPUT_PATH = "input/words.txt";

    /** Regular expression separating words: one or more whitespace characters. */
    private static final String WORD_SEPARATOR_REGEX = "\\s+";

    /**
     * Builds and runs the word-count job.
     *
     * @param args optional; {@code args[0]} is the path of the text file to
     *             read. When absent, {@value #DEFAULT_INPUT_PATH} is used.
     * @throws RuntimeException if executing the job or printing its result fails;
     *                          the original exception is kept as the cause
     */
    public static void main(String[] args) {
        final String inputPath =
                (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // 1. Create the batch execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // 2. Read the input file line by line (DataSet API).
        DataSource<String> lineDataSource = env.readTextFile(inputPath);

        // 3. Split every line into words and emit one (word, 1) tuple per word.
        FlatMapOperator<String, Tuple2<String, Long>> wordAndOneTuple = lineDataSource
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    for (String word : line.split(WORD_SEPARATOR_REGEX)) {
                        // Blank lines and leading whitespace make split() return an
                        // empty token; it is not a word and must not be counted.
                        if (!word.isEmpty()) {
                            out.collect(Tuple2.of(word, 1L));
                        }
                    }
                })
                // The lambda's generic output type is erased at compile time, so
                // the tuple type has to be declared explicitly.
                .returns(Types.TUPLE(Types.STRING, Types.LONG));

        // 4. Group the tuples by the word (tuple field 0).
        UnsortedGrouping<Tuple2<String, Long>> wordAndOneGroup = wordAndOneTuple.groupBy(0);

        // 5. Sum the counts (tuple field 1) inside each group.
        AggregateOperator<Tuple2<String, Long>> sum = wordAndOneGroup.sum(1);

        // 6. Print the result; print() declares a checked Exception, which is
        //    wrapped so that main keeps its original signature.
        try {
            sum.print();
        } catch (Exception e) {
            throw new RuntimeException(
                    "Failed to run batch word count on '" + inputPath + "'", e);
        }
    }
}
